
* Session setup: semicolon-delimited commands, empty workspace, path globals, load the master data set;
#delimit;

clear;
clear matrix;
* NOTE(review): set memory presumably only matters for older Stata releases - confirm for the release in use;
set memory 1000m;
set more off;

#delimit;
* both globals are prepended directly to file names below, so each needs a trailing path separator;
global MY_OUT_PATH  "[path where your master data set will be stored]";
global MY_TEMP_PATH "[path where your temporary files are stored]";


* close a log left open by an earlier run, capture suppresses the error when none is open;
cap log close;  
use  "${MY_OUT_PATH}NEW_CNEF2.dta", clear;

*************************;
* derive the basic demographic variables used by the project;
*************************;

#delimit;
* male: 1 for sex==1, 0 for sex==2, missing for every other code;
gen male = cond(sex==1, 1, cond(sex==2, 0, .));

tab d11104;

* one 0/1 indicator per marital status code of d11104 (0 when the code is missing);
gen married   = (d11104==1);
gen single    = (d11104==2);
gen widowed   = (d11104==3);
gen divorced  = (d11104==4);
gen separated = (d11104==5);

rename d11109 yearseduc;

* birth year code -1 is treated as unknown, so age is missing for those rows;
replace gebjahr = . if gebjahr==-1;
gen age = year - gebjahr;


*************************;
* East/West in 1989;
*************************;

#delimit;
* loc1989 code 1 gives East (1), code 2 gives West (0), every other code stays missing;
gen East1989 = cond(loc1989==1, 1, cond(loc1989==2, 0, .));



*************************;
* create labor force participation variable;
*************************;

#delimit;
* marginal employment flag from empstat code 4;
gen margwork = (empstat==4);

* unemployment flag taken from unemp;
gen unemployed1 = (unemp==1);
tab unemployed1;

* working per lfs codes 5, 11 and 12, but never when marginally employed or flagged unemployed;
gen workingLFS = (inlist(lfs, 5, 11, 12) & margwork~=1 & unemployed1~=1);
tab workingLFS;

* unemployed: lfs code 6 or the unemp-based flag;
gen unemployed = (lfs==6 | unemployed1==1);
tab unemployed;

* labor force participation: working or unemployed;
gen lfp = (workingLFS==1 | unemployed==1);


#delimit;
* part-time/full-time indicators from empstat, both are missing when empstat is missing;
gen parttime=0;
replace parttime=. if empstat==.;
replace parttime=1 if (empstat==2 | empstat==12);
tab parttime;

#delimit;
gen fulltime=0;
replace fulltime=. if empstat==.;
* the original condition tested empstat==1 twice, the redundant duplicate is removed and the result is unchanged;
* NOTE(review): parttime pairs codes 2 and 12, so a second full-time code (11?) may have been intended - confirm against the codebook;
replace fulltime=1 if empstat==1;
tab fulltime;

#delimit;
* empstat is not used after this point;
drop empstat;


* working is defined further below from positive individual hours worked;

#delimit;
* tatzeit code -2 becomes zero hours, codes -3 and -1 become missing;
replace tatzeit = cond(tatzeit==-2, 0, .) if inlist(tatzeit, -3, -2, -1);

* 75th percentile of tatzeit for parttime==1 gives 30 hours;
sum tatzeit if parttime==1, detail;
sum tatzeit if fulltime==1, detail;

*************************;
* self employment;
*************************;
#delimit;
* selfemployed: 0 by default, missing for selfemp codes ., -3 and -1, and 1 for any positive selfemp;
gen selfemployed=0;
replace selfemployed=. if (selfemp==. | selfemp==-3 | selfemp==-1);
replace selfemployed=1 if (selfemp>0 & selfemp~=.);
tab selfemployed;

* the raw variable is no longer needed;
drop selfemp;

*************************;
* cleaning income variables;
* income variables East missing in 1990 and 1991;
* some missing observations in individual income components in other years and samples;
*************************;

#delimit;
* code -2 is recoded to missing in every income component listed here;
foreach incvar in i11101 i11102 i11103 i11104 i11105 i11106 i11107 i11108 i11109 i11110 i11117 ijob1 ijob2 iself i13ly i14ly ixmas iholy igray iothy divdy {;
    replace `incvar'=. if `incvar'==-2;
};

* i11117 missing and reported as 0 in 1984 and 1985;
* iothy reported as 0 or -2 in 1984, so it is set to 0 for that year;
replace iothy=0 if year==1984;
* imilt has 0 and -2 until 1995 in all samples, so it is set to 0 for 1984 to 1995;
replace imilt=0 if inrange(year, 1984, 1995);

* divdy has 0 instead of -2 in 1990 East, but is available as only income variable in East in 1991;
replace divdy=. if (psample==3 & year==1990);


*************************;
* cleaning hours and earnings variables;
**************************;

#delimit;
* if hours are -1 and earnings are 0 or do not apply, I set hours=0;
replace e11101=0 if (e11101==-1 & (i11110==0 | i11110==.));
sum e11101;

#delimit;
* any remaining -1 hours are treated as unknown;
replace e11101=. if e11101==-1;

#delimit;
* rows with both hours and earnings missing get zero earnings and then zero hours;
replace i11110=0 if (e11101==. & i11110==.);
replace e11101=0 if (e11101==. & i11110==0);
* NOTE(review): this count tests i11101 (a household income variable), not i11110 (individual earnings) - confirm which was meant;
count if e11101~=. & i11101==.;

* if hours are 0 & workingLFS is 0, I set these earnings to 0, otherwise I keep as missing;
* the original condition named working, a variable that is only generated later in this file;
* at this point that name can only resolve through variable abbreviation to workingLFS;
* workingLFS is spelled out so the line no longer depends on abbreviation being enabled, the result is unchanged;
replace i11110=0 if (i11110==. & workingLFS==0 & e11101==0);


*************************;
* create and rename income variables for project;
*************************;

#delimit;
* nominal household aggregates computed from the raw i111xx components before those are renamed below;
gen taxtransNOM = i11107-i11109+i11108;
gen HHpreincNOM = i11101-i11104;
gen HHpreinc2NOM = i11103+i11106+i11117;
gen HHpostincNOM = i11102-i11104;
gen HHpostinc2NOM = i11101-i11104+i11107-i11109+i11108;
gen HHpreincplustaxesNOM = i11101-i11104-i11109;  
gen HHpreincplustransfersNOM = i11101-i11104+i11107+i11108;	
gen HHallpreincplustaxesNOM = i11101-i11109;


#delimit;
* wage = individual earnings divided by hours, computed only where both are positive;
gen wageNOM=i11110/e11101 if (i11110>0 & e11101>0 & i11110~=.);
* set wage=0 if earnings and hours =0;
replace wageNOM=0 if (i11110==0 & e11101==0);

#delimit;
* give the raw variables project names, the NOM suffix marks values that are deflated further below;
rename i11106 privtransNOM;
rename i11103 HHearningNOM;
rename i11110 INDearningNOM;
rename e11101 INDhours;
rename hhnettoincmonth hhnettoincmonthNOM;
rename opery operyNOM;
rename i11101 HHpreincSOEPNOM;
rename i11102 HHpostincSOEPNOM;

sum HHpreincNOM HHpreinc2NOM HHpostincNOM HHpostinc2NOM HHpreincplustaxesNOM HHallpreincplustaxesNOM HHpreincplustransfersNOM HHearningNOM INDearningNOM INDhours wageNOM HHpreincSOEPNOM HHpostincSOEPNOM;


* Anyone who reports positive, non-missing hours counts as working;

gen working = (INDhours>0 & INDhours~=.);


*************************;
* financial asset income;
*************************;

#delimit;
* NOTE(review): this blanks divdy for psample 3 in 1991 as well as 1990, while the income cleaning comment earlier in this file says divdy is available in East in 1991 - confirm;
replace divdy =. if (psample==3 & (year==1990 | year==1991));
* calculate interest payment on consumer credit from 1997 on;


* NOTE(review): 0.1 is presumably the assumed interest share of the monthly credit payment - confirm;
gen credit_interest_month = creditdebtamount*0.1;
* code -2 in creditdebtamount gives zero interest;
replace credit_interest_month = 0 if creditdebtamount==-2;
* other negative amounts count as zero when creditdebtyn==2 and as missing otherwise;
replace credit_interest_month = 0 if (credit_interest_month<0 & creditdebtyn==2);
replace credit_interest_month = . if credit_interest_month<0;
* annualise, and use zero before 1997;
gen credit_interest = credit_interest_month*12;
replace credit_interest = 0 if year<1997;

* financial income is divdy, net of credit interest from 1997 on;
gen HHfinincNOM = divdy;
replace HHfinincNOM = divdy-credit_interest if year>=1997;




*************************;
* create real income variables ;
*************************;

#delimit;
* code -2 in the deflator is treated as missing;
replace y11101=. if y11101==-2;

* every nominal amount is divided by y11101 and multiplied by 100;
* NOTE(review): y11101 is presumably a price index with base 100 - confirm;
gen taxtrans = taxtransNOM/y11101*100 ;
gen HHpreinc = HHpreincNOM/y11101*100 ;
gen HHpostinc = HHpostincNOM/y11101*100 ;
gen wage = wageNOM/y11101*100;
gen privtrans = privtransNOM/y11101*100;
gen HHearning = HHearningNOM/y11101*100;
gen INDearning = INDearningNOM/y11101*100;
gen HHfininc = HHfinincNOM/y11101*100;
gen credit_interestREAL = credit_interest/y11101*100;
gen rentyREAL = renty/y11101*100;
gen i11104REAL = i11104/y11101*100;
gen i11105REAL = i11105/y11101*100;
gen HHpreincplustaxes = HHpreincplustaxesNOM/y11101*100 ;
gen HHpreincplustransfers = HHpreincplustransfersNOM/y11101*100 ;
gen HHallpreincplustaxes = HHallpreincplustaxesNOM/y11101*100 ;
gen opery = operyNOM/y11101*100 ;
gen HHpreincSOEP = HHpreincSOEPNOM/y11101*100 ;
gen HHpostincSOEP = HHpostincSOEPNOM/y11101*100 ;


#delimit;
* hhnettoincmonth needs to be converted into Euro as well;
* zero and negative monthly amounts are set to missing before deflating and annualising;
replace hhnettoincmonthNOM=. if hhnettoincmonthNOM<=0;
gen hhnettoincmonth=hhnettoincmonthNOM/y11101*100;
gen hhnettoinc=hhnettoincmonth*12;


* 0 or negative HHpostinc is implausible and set to missing ;
replace HHpostinc=. if HHpostinc<=0;
replace HHpostincSOEP=. if HHpostincSOEP<=0;

#delimit;
sum HHpreincSOEP HHpostincSOEP HHpreinc HHpostinc HHpreincplustaxes HHallpreincplustaxes HHpreincplustransfers hhnettoinc HHearning INDearning wage HHfininc;


*************************;
* create dummies as controls for highest education;
*************************;
#delimit;
** I'm looking for the HIGHEST degree (defined according to the following implicit ladder) ;
* ladder from top to bottom: college, vocational, school, noschool - a lower rung is switched off whenever a higher one is set;

generate college=0;
generate vocational=0;
generate school=0;
generate noschool=0;


replace college=1 if ((pbbil02>=1 & pbbil02<=5) | pbbila==4 | pbbil03==3);

#delimit;
replace vocational=1 if ((pbbil01>0 & pbbil01<7) | (pbbilo>=1 & pbbilo<=4) | pbbil03==2 | (pbbila>=2 & pbbila<=3));
replace vocational=0 if college==1;
* pbbil03==1 overrides any vocational code set above;
replace vocational=0 if pbbil03==1;

#delimit;
replace school=1 if ((psbil>=1 & psbil<=5) | (psbilo>=1 & psbilo<=4) | (psbila>=2 & psbila<=3) );
replace school=0 if (vocational==1 | college==1);

#delimit;
replace noschool=1 if (psbil==6 | psbil==7 | psbilo==5 | psbila==1);
replace noschool=0 if (school==1 | vocational==1 | college==1);

sum  college vocational school  noschool;
 

* flag individuals for whom education is missing;
* a person is flagged when none of the four dummies is set;
#delimit;
gen educationmiss=0;
replace educationmiss=1 if (college==0 & vocational==0 & school==0 & noschool==0);
* inspect the raw education variables of the flagged persons;
sum pb* ps* if educationmiss==1;


*************************;
* GDR education;
*************************;

#delimit;
* 1 when psbilo is 1 to 5, pbbilo is 1 to 4, or pbbil02 is 4 or 5 - missing when education is flagged as missing;
gen GDReducation = cond(educationmiss==1, .,
    (inrange(psbilo, 1, 5) | inrange(pbbilo, 1, 4) | pbbil02==4 | pbbil02==5));


*************************;
* East/West in 1989;
*************************;

#delimit;
* for some who refuse to answer loc1989 (negative code), GDReducation decides the assignment;
replace East1989 = GDReducation if (loc1989<0 & inlist(GDReducation, 0, 1));



*************************;
* current residence in East or West;
*************************;

#delimit;
* negative bula codes are treated as missing;
replace bula = . if bula<0;
* bula codes 11 to 16 count as East, missing bula gives missing;
gen CurrentEast = cond(bula==., ., inrange(bula, 11, 16));



*************************;
* OECD Equivalence Scale for Household;
*************************;
#delimit;
* we count as children ages 0-16, and as adults ages 17 and up;

* count the household members aged exactly 17 per yearhhnr and merge that count back;
preserve; 
collapse (count) age if age==17, by(yearhhnr);
rename age aged17;
sort yearhhnr;
save "${MY_TEMP_PATH}aged17.dta", replace;
restore;
#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}aged17.dta";
tab _merge;
drop _merge;

* households without a 17-year-old are absent from the collapsed file;
replace aged17=0 if aged17==.;

#delimit;
* the 17-year-olds are moved from the d11107 count to the adults;
* NOTE(review): presumes d11106 is household size and d11107 the number of children - confirm;
gen adults=d11106-d11107+aged17;
gen children=d11107-aged17;
drop aged17;

* weight 1 for the first adult, 0.7 for each further adult, 0.5 per child;
gen OECDscale=1+0.7*(adults-1)+0.5*children;

*************************;
* create dummies for household composition;
*************************;

#delimit;
* one indicator per (adults, children) cell - adults are capped at 3 or more, children at 2 or more;
* each indicator is left missing when adults or children is missing;
gen adults1kids0 = (adults==1 & children==0) if !(adults==. | children==.);
gen adults1kids1 = (adults==1 & children==1) if !(adults==. | children==.);
gen adults1kids2 = (adults==1 & children>=2) if !(adults==. | children==.);
gen adults2kids0 = (adults==2 & children==0) if !(adults==. | children==.);
gen adults2kids1 = (adults==2 & children==1) if !(adults==. | children==.);
gen adults2kids2 = (adults==2 & children>=2) if !(adults==. | children==.);
gen adults3kids0 = (adults>=3 & children==0) if !(adults==. | children==.);
gen adults3kids1 = (adults>=3 & children==1) if !(adults==. | children==.);
gen adults3kids2 = (adults>=3 & children>=2) if !(adults==. | children==.);
sum adults*;

*************************;
* adjust income variables by OECDscale;
*************************;

#delimit;
* equivalised version of each household measure: value divided by OECDscale;
foreach v in taxtrans HHpreinc HHpostinc privtrans HHearning HHpreincSOEP HHpostincSOEP {;
    gen `v'OECD = `v'/OECDscale;
};

sum HHpostincSOEPOECD  HHpreincSOEPOECD HHpreinc HHpreincOECD HHpostinc HHpostincOECD HHearning HHearningOECD ;


*************************;
* generate logs of relevant variables;
*************************;
#delimit;
* log of each listed variable, stored under the same name with a log prefix;
foreach v in taxtrans wage INDearning hhnettoinc taxtransOECD privtransOECD {;
    gen log`v' = log(`v');
};




****************;
* imputedrent;
****************;
#delimit;
* ownhouse is 1 for eigen==1, 0 for other codes, missing when eigen is missing;
gen ownhouse=0;
replace ownhouse=1 if eigen==1;
replace ownhouse=. if eigen==.;
* imputed rent is zero for imputedrentdk code -2 with a non-positive amount, and for non-owners;
replace imputedrent = 0 if (imputedrentdk==-2 & imputedrent<=0);
replace imputedrent = 0 if ownhouse==0;

#delimit;
* remaining codes of -2 and below are treated as missing, then the amount is annualised;
replace imputedrent = . if imputedrent<=-2;
gen imputedrentannual = imputedrent*12;
replace imputedrentannual = . if year==1984;

* no imputed rent for 2003 & 2004: take average value of 2002 & 2005;
#delimit;
preserve;
collapse (mean) imputedrentannual if (year==2002 | year==2005), by(persnr);
rename imputedrentannual imputedrent200205;
sort persnr; 
save "${MY_TEMP_PATH}imputedrent200205.dta", replace;
restore;
#delimit;
sort persnr;
merge m:1 persnr using "${MY_TEMP_PATH}imputedrent200205.dta";
tab _merge;
drop _merge;
replace imputedrentannual = imputedrent200205 if (year==2003 | year==2004);


* if imputedrent is 0 before a miss, and imputedrent and ownhouse missing afterwards, we set it to zero for miss;
* NOTE(review): rows are sorted by persnr and year but the previous row is matched on cid, not persnr - confirm this is intended;
#delimit;
sort persnr year;
replace imputedrentannual = 0 if (ownhouse==. & imputedrentannual==. & imputedrentannual[_n-1]==0 &  ownhouse[_n-1]==0 & cid==cid[_n-1]); 

****************;
* IMPUTE MORTGAGE INTEREST PAYMENTS;
****************;
#delimit;
* negative einzug codes are treated as missing;
replace einzug=. if einzug<0;

****************;
* already converted into Euro - deflate;
****************;

* only positive mortgage amounts are deflated, so zero and negative codes keep their value;
replace mortgageown = mortgageown/y11101*100 if mortgageown>0;
replace mortgageother = mortgageother/y11101*100 if mortgageother>0;
replace imputedrentannual = imputedrentannual/y11101*100;

****************;
* determine start year of mortgage;
****************;
#delimit;
* yearmortgage counts the years since einzug, with the einzug year itself as year 1;
gen yearmortgage=.;
replace yearmortgage=year-einzug+1 if einzug>0 & mortgageown>0;

***************;
* determine start year of mortgage for previous renters, now owners;
***************;
#delimit;
sort persnr year;

gen boughtfromowner=0;
replace boughtfromowner=1 if newowner==1;

#delimit;
* for rows flagged by newowner the mortgage clock starts in the current year instead of einzug;
gen einzughelp=einzug;
replace einzughelp=year if boughtfromowner==1;
* carry the purchase year and the flag forward to the later rows of the same person;
* the three replaces depend on their order: the flag is carried forward first, then the start year follows it down the sorted rows;
replace einzughelp=einzughelp[_n-1] if persnr==persnr[_n-1] & boughtfromowner[_n-1]==1;
replace boughtfromowner=boughtfromowner[_n-1] if persnr==persnr[_n-1] & boughtfromowner[_n-1]==1;
replace einzughelp=einzughelp[_n-1] if persnr==persnr[_n-1] & boughtfromowner[_n-1]==1;
replace yearmortgage=year-einzughelp+1 if boughtfromowner==1 & mortgageown>0;
drop einzughelp;

* calculate maximum of stated yearmortgage;
* the person-level maximum is merged back onto every row of that person;
#delimit;
preserve;
collapse (max) yearmortgage, by(persnr);
rename yearmortgage yearmortgagemax;
sort persnr;
save "${MY_TEMP_PATH}yearmortgagemax.dta", replace;
restore;
#delimit;
sort persnr;
merge m:1 persnr using "${MY_TEMP_PATH}yearmortgagemax.dta";
tab _merge;
drop _merge;

****************;
* assume overall duration of mortgage and interest rate;
****************;
#delimit;
* default duration is 30 years, longer stated mortgage years (person maximum at most 40) replace the default;
gen durationmort = 30 ;
* yearmortgage<. is needed because Stata ranks missing above every number;
* without it, rows with an unknown mortgage year had the 30-year default overwritten with missing;
replace durationmort = yearmortgage if yearmortgage>30 & yearmortgage<. & yearmortgagemax<=40;
* assumed mortgage interest rate;
gen intmort = 0.0825;



****************;
* calculate yearly amortization for owner occupied housing;
****************;
#delimit;
* one factor k per mortgage year 1 to 30, amortization is the payment divided by (1+k);
local amortfactors 9.79 8.96 8.20 7.50 6.85 6.26 5.70 5.19 4.72 4.28 3.88 3.51 3.17 2.85 2.56 2.28 2.03 1.80 1.59 1.39 1.21 1.04 0.89 0.74 0.61 0.49 0.37 0.27 0.17 0.08;
* rows whose yearmortgage is outside 1 to 30 stay missing here;
gen amortization_own = .;
local mortyear = 0;
foreach k of local amortfactors {;
    local ++mortyear;
    replace amortization_own = mortgageown/(1+`k') if yearmortgage==`mortyear';
};

* nopaymort is 1 for paymortgage codes -2 and 2 (presumably no mortgage payment - confirm) and missing otherwise;
gen nopaymort=.;
replace nopaymort=1 if (paymortgage==-2 | paymortgage==2);

#delimit;
* special codes with a non-positive mortgageown are mapped to zero or missing amortization;
replace amortization_own = 0 if (nopaymort==1 & mortgageown<=0 );
replace amortization_own = . if (paymortgage==1 & mortgageown<=0 & year>=1999);
replace amortization_own = 0 if (paymortgage==-1 & mortgageown<=0);
replace amortization_own = . if (mortgageown==-3 & year<=1998);
replace amortization_own = 0 if mortgageown==-2;
* mortgage years above 30 with a person maximum of at most 40: the whole payment counts as amortization;
* yearmortgage<. is needed because Stata ranks missing above every number;
* without it, yearmortgage>30 also matched rows with an unknown mortgage year;
replace  amortization_own=mortgageown if yearmortgagemax>30 & yearmortgage>30 & yearmortgage<. & yearmortgagemax<=40 & mortgageown>0;
* persons whose longest stated mortgage year exceeds 40 get the flat factor 2.41 in every year;
replace  amortization_own=mortgageown/(1+2.41) if yearmortgagemax>40 & yearmortgagemax~=. & mortgageown>0;


****************;
* calculate yearly amortization for other dwellings;
****************;
#delimit;
* monthly payment on other dwellings;
generate mortgageothermonth=mortgageother/12;

* this variable is missing for 1991: take average of 1990 and 1992 for all except east sample;
#delimit;
preserve;
collapse (mean) mortgageothermonth if ((year==1990 | year==1992) & psample~=3), by(persnr);
rename mortgageothermonth mortgageothermonth9092;
sort persnr; 
save "${MY_TEMP_PATH}mortgageothermonth9092.dta", replace;
restore;
#delimit;
sort persnr;
merge m:1 persnr using "${MY_TEMP_PATH}mortgageothermonth9092.dta";
tab _merge;
drop _merge;
replace mortgageothermonth = mortgageothermonth9092 if (year==1991 & psample~=3);


* flat factor 2.41 for other dwellings, zero amortization when mortgageother is not positive;
gen amortization_other = mortgageothermonth/(1+2.41);
replace amortization_other = 0 if mortgageother<=0;

****************;
* interest is the difference between mortgage payment and amortization;
****************;
# delimit;
* interest is zero whenever the corresponding mortgage amount is not positive;
gen interest_own_month = mortgageown-amortization_own;
replace interest_own_month = 0 if mortgageown<=0;
gen interest_other_month = mortgageothermonth - amortization_other; 
replace interest_other_month = 0 if mortgageother<=0;
 
****************;
* go from monthly to annual;
****************;
#delimit;
* all interest measures are set to missing for 1984;
gen interest_own = interest_own_month * 12;
replace interest_own = . if year==1984;
gen interest_other = interest_other_month *12;
replace interest_other = . if year==1984;

* total mortgage interest is missing as soon as one component is missing;
gen mortgageinterest = interest_own + interest_other;
replace mortgageinterest = . if year==1984;


****************;
* imputation of rentyREAL for 1991;
****************;

#delimit;
* there is something wrong in imputation of renty for 1991 in CNEF file;
* both in means and standard deviation, it does not match very well, and we redo it;
* the 1991 value is replaced by the person mean over 1990 and 1992, for all samples except psample 3;
preserve;
collapse (mean) rentyREAL if ((year==1990 | year==1992) & psample~=3), by(persnr);
rename rentyREAL rentyREAL9092;
sort persnr; 
save "${MY_TEMP_PATH}rentyREAL9092.dta", replace;
restore;
#delimit;
sort persnr;
merge m:1 persnr using "${MY_TEMP_PATH}rentyREAL9092.dta";
tab _merge;
drop _merge;
replace rentyREAL = rentyREAL9092 if (year==1991 & psample~=3);


****************;
* get total household asset income;
****************;
#delimit;
* real asset income: imputed rent plus rental income minus mortgage interest, missing for 1984;
gen HHrealassetinc = imputedrentannual + rentyREAL - mortgageinterest;
replace HHrealassetinc =. if year==1984;
replace HHfininc=. if year==1984;
* FS version: financial income plus real asset income;
gen HHassetincFS = HHfininc + HHrealassetinc;


* alternative based more on CNEF, coming as close to our definition as we can;
* set missing values to zero here, to increase number of observations;

* note that these two replaces permanently overwrite interest_other and credit_interestREAL;
replace interest_other = 0 if mortgageother==.;
replace credit_interestREAL = 0 if credit_interestREAL==.;
gen HHassetinc = i11104REAL + i11105REAL - interest_other - credit_interestREAL + opery;
replace HHassetinc=. if year==1984;
#delimit;
* NOTE(review): unlike HHassetinc, this variant is not set to missing for 1984 - confirm this is intended;
gen HHassetincminusopery=i11104REAL + i11105REAL - interest_other - credit_interestREAL ;


* alternative simply following CNEF, not paying attention to our definition;

gen HHassetincCNEF = i11104REAL + i11105REAL;
replace HHassetincCNEF=. if year==1984;
 

#delimit;
* equivalised version of every asset income measure: value divided by OECDscale;
foreach v in HHrealassetinc HHfininc HHassetincFS HHassetinc HHassetincminusopery HHassetincCNEF {;
    gen `v'OECD = `v' /OECDscale;
};


* log of every asset income measure (missing where the measure is not positive);
foreach v in HHrealassetinc HHfininc HHassetincFS HHassetinc HHassetincminusopery HHassetincCNEF {;
    gen log`v' = log(`v');
};


*************************;
* Final income variables as defined in 2008;
*************************;

#delimit;
* HHearning is fine;
* the earlier HHpreinc is kept as HHpreearning, and HHpreinc is redefined to include asset income;
rename HHpreinc HHpreearning;
gen HHpreinc = HHpreearning + HHassetinc;
* HHpostinc is overwritten in place, so it includes asset income from here on;
replace HHpostinc = HHpostinc + HHassetinc; 
sum HHpostinc HHassetinc HHpreinc HHpreearning;

#delimit;
* CNEF variants use HHassetincCNEF instead of HHassetinc;
gen HHpreincCNEF = HHpreearning + HHassetincCNEF;
* recomputed from the nominal value, so the earlier removal of non-positive HHpostinc does not apply here;
gen HHpostincCNEF = HHpostincNOM/y11101*100 + HHassetincCNEF; 

#delimit;
* logs of the final household income measures;
foreach v in HHearning HHpreearning HHpreinc HHpostinc HHpreincCNEF HHpostincCNEF HHpreincSOEP HHpostincSOEP {;
    gen log`v' = log(`v');
};


* the equivalised versions created earlier are dropped and rebuilt from the current definitions;
drop HHearningOECD HHpreincOECD HHpostincOECD;

foreach v in HHearning HHpreearning HHpreinc HHpostinc HHpreincCNEF HHpostincCNEF {;
    gen `v'OECD = `v' / OECDscale;
};


* logs of the equivalised measures, including the two SOEP versions created earlier;
foreach v in HHearningOECD HHpreearningOECD HHpreincOECD HHpostincOECD HHpreincCNEFOECD HHpostincCNEFOECD HHpreincSOEPOECD HHpostincSOEPOECD {;
    gen log`v' = log(`v');
};


*************************;
* Household Head;
*************************;

* for couples: male;
* for non-couples: oldest male (between 25 and 60);
* otherwise: female;
***************************************************************************************;

* count the household members aged exactly 18 per yearhhnr and merge that count back;
#delimit;
preserve; 
collapse (count) age if age==18, by(yearhhnr);
rename age aged18;
sort yearhhnr;
save "${MY_TEMP_PATH}aged18.dta", replace;
restore;

#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}aged18.dta";
drop _merge;


* households without an 18-year-old are absent from the collapsed file;
replace aged18=0 if aged18==.;
* NOTE(review): presumes d11106-d11107 is the number of members aged 18 and over, so older18 counts those above 18 - confirm;
gen older18=d11106-d11107-aged18;
tab older18;

#delimit;
gen head=.;

* single households, i.e. with only 1 member older than 18;
replace head=1 if (older18==1 & age>=19);				
replace head=0 if (older18==1 & age<19);


* identify couples;
* couple household: exactly one member d11105==1, 1 member d11105==2, both of different sex;
* NOTE(review): the code below only requires that a d11105==1 row and a d11105==2 row exist, it does not test uniqueness or differing sex - confirm;

* headsex: sum of sex over the d11105==1 rows of each household with at least two members older than 18;
#delimit;
preserve;
keep if older18>=2;
collapse (sum) sex if d11105==1, by(yearhhnr);
rename sex headsex;
sort yearhhnr;
save "${MY_TEMP_PATH}headsex.dta", replace;
restore;
#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}headsex.dta";
tab _merge;
drop _merge;

* partnersex: same sum over the d11105==2 rows;
preserve;
keep if older18>=2;
collapse (sum) sex if d11105==2, by(yearhhnr);
rename sex partnersex;
sort yearhhnr;
save "${MY_TEMP_PATH}partnersex.dta", replace;
restore;
#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}partnersex.dta";
tab _merge;
drop _merge;

tab headsex partnersex;


#delimit;
* couples need head and spouse present;
* couple is 1 or missing, never 0;
gen couple=1 if (headsex~=. & partnersex~=.);
#delimit;
* in couple households the male with d11105 code 1 or 2 is head, everyone else is not;
replace head=1 if (couple==1 & male==1 & (d11105==1 | d11105==2));
replace head=0 if (couple==1 & male==1 & d11105~=1 & d11105~=2);
replace head=0 if (couple==1 & male==0);
* spouse is the female with d11105 code 1 or 2 in a couple household;
gen spouse=0;
replace spouse=1 if (couple==1 & male==0 & (d11105==1 | d11105==2));


* for non-couple, non-single households: use sex and age to determine head;
* oldest male between 25 and 60 is head;
* if only females: oldest female between 25 and 60 is head;

* malepresent: household mean of male over members aged 25 and up, a positive mean means a male is present;
* NOTE(review): a missing age passes age>=25 and a missing mean passes male>0 - confirm such rows cannot occur;
#delimit;
preserve;
keep if older18>=2 & couple==.;
collapse male if age>=25, by (yearhhnr);
gen malepresent=0;
replace malepresent=1 if male>0;
sort yearhhnr;
drop male;
save "${MY_TEMP_PATH}malepresent.dta", replace;
restore;
#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}malepresent.dta";
tab _merge;
drop _merge;

#delimit;
* if male present: find oldest male aged 25 to 60;
preserve;
collapse (max) age if (malepresent==1 & male==1 & age>=25 & age<=60), by(yearhhnr);
rename age maxagemale;
sort yearhhnr;
save "${MY_TEMP_PATH}maxagemale.dta", replace;
restore;
#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}maxagemale.dta";
tab _merge;
drop _merge;

#delimit;
replace head=1 if (age==maxagemale & malepresent==1 & male==1);
replace head=0 if (malepresent==1 & male==0);
replace head=0 if (age~=maxagemale & malepresent==1 & male==1);
* 1.6 is a marker value for households without a male aged 25 to 60, their head sums differ from 1 and they are dropped at sample selection;
replace head=1.6 if (malepresent==1 & male==1 & maxagemale==.);


#delimit;
* if no male present: find oldest female aged 25 to 60;
preserve;
collapse (max) age if (malepresent==0 & male==0 & age>=25 & age<=60), by(yearhhnr);
rename age maxagefemale;
sort yearhhnr;
save "${MY_TEMP_PATH}maxagefemale.dta", replace;
restore;
#delimit;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}maxagefemale.dta";
tab _merge;
drop _merge;

#delimit;
replace head=1 if (age==maxagefemale & malepresent==0 & male==0 );
replace head=0 if (age~=maxagefemale & malepresent==0 );
* same 1.6 marker for households without a female aged 25 to 60;
replace head=1.6 if (malepresent==0 & maxagefemale==. );


* double-check;
* headcheck is the sum of head per household, a clean household has exactly 1;
#delimit;
preserve;
collapse (sum) head, by(yearhhnr);
rename head headcheck;
sort yearhhnr;
save "${MY_TEMP_PATH}headcheck.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}headcheck.dta";
tab _merge;
drop _merge;

tab headcheck;

#delimit;
tab d11105 if headcheck==2 & head==1;
* if there is tie in age, we define as head the one who is also defined as head by household itself;

* in households with two heads, drop heads outside ages 25 to 60 and heads whose d11105 is not 1;
replace head=0 if (headcheck==2 & head==1 & (age<25 | age>60));
replace head=0 if (headcheck==2 & head==1 & d11105~=1);

* recount the heads after resolving the ties;
#delimit;
preserve;
collapse (sum) head, by(yearhhnr);
rename head headcheck2;
sort yearhhnr;
save "${MY_TEMP_PATH}headcheck2.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}headcheck2.dta";
tab _merge;
drop _merge;

tab headcheck2;




*************************;
* create education dummies head-spouse;
*************************;

* spousepresent: number of spouse rows per household;
#delimit;
preserve;
collapse (sum) spouse, by(yearhhnr);
rename spouse spousepresent;
save "${MY_TEMP_PATH}spousepresent.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}spousepresent.dta";
tab _merge;
drop _merge;

* households without spouse: education dummies of the head (Hcol, Hvoc, Hsch, Hnos);
#delimit;
preserve;
keep if (spousepresent==0 & head==1 );
collapse (sum) Hcol=college Hvoc=vocational Hsch=school Hnos=noschool, by(yearhhnr);
sort yearhhnr;
save "${MY_TEMP_PATH}educationdummieshead.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}educationdummieshead.dta";
tab _merge;
drop _merge;

* households with a spouse get zero in the head-only dummies;
#delimit;
replace Hcol=0 if spousepresent==1;
replace Hvoc=0 if spousepresent==1;
replace Hsch=0 if spousepresent==1;
replace Hnos=0 if spousepresent==1;


* households with spouse: rows are ordered head first within household, so first picks the head (CH) and last the spouse (CS);
#delimit;
preserve;
keep if (spousepresent==1 & (head==1 | spouse==1));
gsort yearhhnr -head;
collapse (first) CHcol=college CHvoc=vocational CHsch=school  CHnos=noschool (last) CScol=college CSvoc=vocational CSsch=school  CSnos=noschool , by(yearhhnr);
sort yearhhnr;
save "${MY_TEMP_PATH}educationdummies.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}educationdummies.dta";
tab _merge;
drop _merge;

#delimit;
* head (H) by spouse (S) education cells: 1 when the head has the first degree and the spouse the second, 0 otherwise;
* the spouse degree varies in the outer loop so the variables are created in the order Scol, Svoc, Ssch, Snos;
foreach sp in col voc sch nos {;
    foreach hd in col voc sch nos {;
        gen H`hd'S`sp' = (CH`hd'==1 & CS`sp'==1);
    };
};

#delimit;
* consistency check: presumably every household should fall into exactly one head-only or head-spouse education cell;
* the original sum listed HcolScol twice, which double-counted college-college couples - each cell now enters once;
gen test=Hcol+Hvoc+Hsch+Hnos+HcolScol+HvocScol+HschScol+HnosScol+HcolSvoc+HvocSvoc+HschSvoc+HnosSvoc+HcolSsch+HvocSsch+HschSsch+HnosSsch+HcolSnos+HvocSnos+HschSnos+HnosSnos;
tab test;
* rows in no cell are cross-checked against the missing-education flag;
tab educationmiss if test==0;
drop test;


#delimit;


**************************************************;
* assign year to year-1;
**************************************************;

* year is overwritten in place, age and the other variables built from year above are not recomputed;
replace year=year-1;


* save interim data for some analyses of sample size;
save  "${MY_TEMP_PATH}NEW_CNEF2b.dta", replace;


**************************************************;
**************************************************;
**************************************************;
* FINAL SAMPLE SELECTION;
**************************************************;
**************************************************;
**************************************************;


* restart from the interim file saved just above;
#delimit;
clear;
set memory 1000m;
use  "${MY_TEMP_PATH}NEW_CNEF2b.dta";



**************************************************;
* sample selection: keep only households with heads between 25 and 60;
**************************************************;

* drop those with 0 heads: all younger than 25, older than 60, or 24 individuals mentioned above;
* drop those with 1.6, 3.2, and 4.8 heads: these are the non-couple households with heads older than 60;
drop if headcheck2~=1;


* agehead: age of the head, spread to every member of the household;
* zero stands in for non-heads inside the collapse and is turned back into missing after the merge;
#delimit;
preserve;
gen agehead=.;
replace agehead=age if (head==1);
replace agehead=0 if agehead==.;
collapse (max) agehead, by (yearhhnr);
save "${MY_TEMP_PATH}agehead.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}agehead.dta";
tab _merge;
drop _merge;
replace agehead=. if agehead==0;

tab agehead;


#delimit;
* keep only households with head between 25 and 60;
keep if agehead>=25 & agehead<=60;



**************************************************;
* sample selection: delete households where any working member has wage below 3 Euros;
**************************************************;

* a household is kept only if no working member has a positive wage below 3;
#delimit;
* person-level flag: working with a positive wage below 3;
gen wageflag = (working==1 & wage>0 & wage<3);

* the household mean of the flag is positive as soon as one member is flagged;
preserve;
collapse (mean) HHwageflag=wageflag, by(yearhhnr);
save "${MY_TEMP_PATH}wageflag.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}wageflag.dta";
tab _merge;
drop _merge;
drop if HHwageflag>0;


* number of households at this point;
* after the collapse there is one row per yearhhnr, so the N reported by sum is the household count;
preserve;
collapse persnr, by(yearhhnr);
sum persnr;
restore;

* check whether we are left with household without head, which should be dropped;

#delimit;
preserve;
collapse (sum) head, by(yearhhnr);
rename head headcheck3;
sort yearhhnr;
save "${MY_TEMP_PATH}headcheck3.dta", replace;
restore;
sort yearhhnr;
merge m:1 yearhhnr using "${MY_TEMP_PATH}headcheck3.dta";
tab _merge;
drop _merge;

tab headcheck3;

#delimit;
drop if headcheck3==0;



* final analysis sample;
save  "${MY_OUT_PATH}NEW_CNEF3_LSsample.dta", replace;
